In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.datasets import load_boston  # removed in scikit-learn 1.2; this notebook assumes an earlier version
from sklearn import linear_model
In [2]:
%matplotlib inline
# When running in IPython/Jupyter, this renders matplotlib figures inline in the notebook
In [3]:
np.set_printoptions(precision=5, suppress=True) # sets float output to 5 decimals
In [4]:
boston = load_boston()
dataset = pd.DataFrame(boston.data, columns=boston.feature_names)
dataset['target'] = boston.target
observations = len(dataset)
variables = dataset.columns[:-1]
X = dataset.iloc[:,:-1]  # .ix has been removed from pandas; use .iloc
y = dataset['target'].values
In [5]:
yq = np.array(y>25, dtype=int)
In [6]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
linear_regression = linear_model.LinearRegression(fit_intercept=True)  # the normalize argument has been removed from scikit-learn
linear_regression.fit(X,y)
print ("coefficients: %s\nintercept: %0.3f" % (linear_regression.coef_,linear_regression.intercept_))
In [7]:
dataset.min()
In [8]:
centering = StandardScaler(with_mean=True, with_std=False)
linear_regression.fit(centering.fit_transform(X),y)
print ("coefficients: %s\nintercept: %0.3f" % (linear_regression.coef_,linear_regression.intercept_))
In [9]:
print ('mean: %0.3f' % np.mean(y))
In [10]:
standardization = StandardScaler(with_mean=True, with_std=True)
linear_regression.fit(standardization.fit_transform(X),y)
print ("coefficients: %s\nintercept: %0.3f" % (linear_regression.coef_,linear_regression.intercept_))
In [11]:
scaling = MinMaxScaler(feature_range=(0, 1))
linear_regression.fit(scaling.fit_transform(X),y)
print ("coefficients: %s\nintercept: %0.3f" % (linear_regression.coef_,linear_regression.intercept_))
In [12]:
import statsmodels.api as sm
Xq = sm.add_constant(standardization.fit_transform(X))
logit = sm.Logit(yq, Xq)
result = logit.fit()
print (result.summary())
In [13]:
print ('odds ratios of coefficients: %s' % np.exp(result.params))
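The same quantities can also be read as percentage changes in the odds for a one-standard-deviation increase of each standardized predictor (a small add-on, not in the original):
print ('%% change in odds: %s' % ((np.exp(result.params) - 1) * 100.0))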
In [14]:
def sigmoid(p):
    return 1 / (1 + np.exp(-p))
print ('intercept: %0.3f' % result.params[0])
print ('probability of value above 25 when all predictors are average: %0.3f' % sigmoid(result.params[0]))
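Equivalently (a hedged check, not in the original), predicting at the average of the standardized predictors, which is approximately a row of zeros plus the constant, should reproduce the same probability:
mean_row = Xq.mean(axis=0).reshape(1, -1)  # roughly [1, 0, 0, ..., 0]
print ('model prediction at the average: %0.3f' % result.predict(mean_row)[0])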
In [15]:
print ('average likelihood of positive response: %0.3f' % (sum(yq) / float(len(yq))))
In [16]:
C = np.ones(len(X))
logit = sm.Logit(yq, C)
result = logit.fit()
print (result.summary())
print ('\nprobability of value above 25 using just a constant: %0.3f' % sigmoid(result.params[0]))
In [17]:
outlook = ['sunny', 'overcast', 'rainy']
temperature = ['hot', 'mild', 'cool']
humidity = ['high', 'normal']
windy = ['TRUE', 'FALSE']
weather_dataset = list()
for o in outlook:
    for t in temperature:
        for h in humidity:
            for w in windy:
                weather_dataset.append([o,t,h,w])
play = [0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1]
In [18]:
import pandas as pd
df = pd.DataFrame(weather_dataset, columns=['outlook', 'temperature', 'humidity', 'windy'])
In [19]:
print (pd.get_dummies(df.humidity).iloc[:5,:])
In [20]:
dummy_encoding = pd.get_dummies(df)
In [21]:
import statsmodels.api as sm
X = sm.add_constant(dummy_encoding)
logit = sm.Logit(play, X)
result = logit.fit()
print (result.summary())
In [22]:
X.drop(['outlook_sunny', 'temperature_mild', 'humidity_normal', 'windy_FALSE'], inplace=True, axis=1)
logit = sm.Logit(play, X)
result = logit.fit()
print (result.summary())
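Assuming a reasonably recent pandas, the same drop-one encoding can be produced in a single step with the drop_first flag. Pandas drops the first category alphabetically, so the reference levels differ from the hand-picked ones above, but the fit is equivalent up to reparameterization:
X_alt = sm.add_constant(pd.get_dummies(df, drop_first=True))
print (X_alt.columns.tolist())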
In [23]:
from sklearn.feature_extraction import DictVectorizer
vectorizer = DictVectorizer(sparse = False)
dict_representation = [{varname:var for var, varname in zip(row,['outlook', 'temperature', 'humidity', 'windy'])} for row in weather_dataset]
print (dict_representation[0])
print (vectorizer.fit_transform(dict_representation))
In [24]:
print (vectorizer.feature_names_)
In [25]:
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
label_encoder = LabelEncoder()
print (label_encoder.fit_transform(df.outlook))
In [26]:
label_encoder.inverse_transform([0,1,2])
In [27]:
print (label_encoder.classes_)
In [28]:
label_binarizer = LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)
print (label_binarizer.fit_transform(label_encoder.fit_transform(df.outlook)))
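A single-step alternative (assuming scikit-learn >= 0.20, where OneHotEncoder accepts string categories directly) avoids chaining the two label transformers:
from sklearn.preprocessing import OneHotEncoder
one_hot = OneHotEncoder(sparse=False)  # the parameter is renamed sparse_output in sklearn >= 1.2
print (one_hot.fit_transform(df[['outlook']])[:5])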
In [29]:
your_text = 'Nomina sunt consequentia rerum'
mapping_words_in_text = {word:position for position, word in enumerate(set(your_text.lower().split(' ')))}
print (mapping_words_in_text)
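A minimal sketch (not in the original) of using such a mapping to one-hot encode a new phrase; words outside the mapping are simply ignored:
def encode(text, mapping):
    vector = [0] * len(mapping)
    for word in text.lower().split(' '):
        if word in mapping:  # out-of-vocabulary words contribute nothing
            vector[mapping[word]] = 1
    return vector
print (encode('nomina rerum', mapping_words_in_text))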
In [30]:
corpus = ['The quick fox jumped over the lazy dog', 'I sought a dog wondering around with a bird', 'My dog is named Fido']
In [31]:
from sklearn.feature_extraction.text import CountVectorizer
textual_one_hot_encoder = CountVectorizer(binary=True)
textual_one_hot_encoder.fit(corpus)
vectorized_text = textual_one_hot_encoder.transform(corpus)
print(vectorized_text.todense())
In [32]:
print (textual_one_hot_encoder.get_feature_names_out())  # get_feature_names_out() supersedes the deprecated get_feature_names()
In [33]:
print (textual_one_hot_encoder.transform(['John went home today']).todense())
In [34]:
from sklearn.feature_extraction.text import HashingVectorizer
hashing_trick = HashingVectorizer(n_features=11, binary=True, norm=None, alternate_sign=False)  # alternate_sign=False supersedes the removed non_negative=True
M = hashing_trick.transform(corpus)
print (M.todense())
In [35]:
print (hashing_trick.transform(['John went home today']).todense())
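Note that with only 11 hash buckets, distinct words may collide into the same column. A quick way to inspect where individual words land (a sketch, not in the original):
for word in ['dog', 'bird', 'fido']:
    print (word, hashing_trick.transform([word]).nonzero()[1])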
In [36]:
boston = load_boston()
labels = boston.feature_names
X = boston.data
y = boston.target
obs = len(y)
print ("Observations: %i" % obs)
print (boston.feature_names)
In [37]:
linear_regression = linear_model.LinearRegression(fit_intercept=True)
linear_regression.fit(X, y)
print ('coefficients : %s' % linear_regression.coef_)
print ('intercept : %0.3f' % linear_regression.intercept_)
In [38]:
from sklearn.metrics import r2_score
print ("R-squared: %0.3f" % r2_score(y, linear_regression.predict(X)))
In [39]:
residuals = y - linear_regression.predict(X)
print ("Head of residual %s" % residuals[:5])
print ("Mean of residuals: %0.3f" % np.mean(residuals))
print ("Standard deviation of residuals: %0.3f" % np.std(residuals))
In [40]:
var = 7 # the variable in position 7 is DIS
partial_residual = residuals + X[:,var] * linear_regression.coef_[var]
scatter = plt.plot(X[:,var], partial_residual, 'o', markerfacecolor='white', markeredgecolor='black')  # explicit edge color keeps markers visible with recent matplotlib defaults
l = plt.xlabel(boston.feature_names[var])
l = plt.ylabel('partial residuals')
In [41]:
X_t = X.copy()
X_t[:,var] = 1./np.sqrt(X_t[:,var])
linear_regression.fit(X_t, y)
residuals = y - linear_regression.predict(X_t)  # recompute the residuals for the refitted model
partial_residual = residuals + X_t[:,var] * linear_regression.coef_[var]
scatter = plt.plot(X_t[:,var], partial_residual, 'o', markerfacecolor='white', markeredgecolor='black')
l = plt.xlabel(boston.feature_names[var])
l = plt.ylabel('partial residuals')
print ("R-squared: %0.3f" % r2_score(y, linear_regression.predict(X_t)))
In [42]:
import numpy as np
from sklearn.preprocessing import LabelBinarizer
LB = LabelBinarizer()
X_t = X.copy()
edges = np.histogram(X_t[:,var], bins=20)[1]
binning = np.digitize(X_t[:,var], edges)
X_t = np.column_stack((np.delete(X_t, var, axis=1),LB.fit_transform(binning)))
linear_regression.fit(X_t, y)
print ("R-squared: %0.3f" % r2_score(y, linear_regression.predict(X_t)))
In [43]:
example = np.array([1,2,np.nan,4,5])
print (example)
In [44]:
print (np.isnan(example))
In [45]:
print (np.nan_to_num(example))
In [46]:
missing = np.isnan(example)
replacing_value = np.mean(example[~missing])
example[missing] = replacing_value
print (example)
In [47]:
from random import sample, seed
import numpy as np
seed(19)
Xm = X.copy()
missing = sample(range(len(y)), len(y)//4)
Xm[missing,5] = np.nan
print ("Header of Xm[:,5] : %s" % Xm[:10,5])
In [48]:
from sklearn.impute import SimpleImputer  # Imputer has been removed from sklearn.preprocessing
impute = SimpleImputer(missing_values=np.nan, strategy='mean')
print ("Header of imputed Xm[:,5] : %s" % impute.fit_transform(Xm[:,[5]]).ravel()[:10])
In [49]:
missing_indicator = np.isnan(Xm[:,5]).astype(int)
print ("Header of missing indicator : %s" % missing_indicator[:10])
In [50]:
boston = load_boston()
dataset = pd.DataFrame(boston.data, columns=boston.feature_names)
labels = boston.feature_names
X = dataset
y = boston.target
In [51]:
boxplot = plt.boxplot(y, labels=['y'])
In [52]:
linear_regression = linear_model.LinearRegression(fit_intercept=True)
linear_regression.fit(X, y)
residuals = y - linear_regression.predict(X)
SSE = np.sum(residuals**2)  # the residues_ attribute has been removed from LinearRegression
RMSE = np.sqrt(SSE / (X.shape[0] - X.shape[1] - 1))
standardized_residuals = residuals / RMSE
scatter = plt.plot(linear_regression.predict(X), standardized_residuals, 'o', markerfacecolor='white', markeredgecolor='black')
plt.plot([-10,50],[0,0], "r-")
plt.plot([-10,50],[3,3], "r--")
plt.plot([-10,50],[-3,-3], "r--")
plt.xlabel('fitted values')
plt.ylabel('standardized residuals')
plt.show()
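Observations whose standardized residual falls beyond the +/-3 bands are outlier candidates; they can be listed directly (a small add-on, not in the original):
outliers = np.where(np.abs(standardized_residuals) > 3)[0]
print ('Candidate outliers: %s' % outliers)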
In [53]:
standardization = StandardScaler(with_mean=True, with_std=True)
Xs = standardization.fit_transform(X)
boxplot = plt.boxplot(Xs[:,0:7],labels=labels[0:7])
In [54]:
boxplot = plt.boxplot(Xs[:,7:13],labels=labels[7:13])
In [55]:
from sklearn.decomposition import PCA
pca = PCA()
pca.fit(Xs)
C = pca.transform(Xs)
print (pca.explained_variance_ratio_)
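The cumulative sum of the ratios (a small add-on) makes it easier to decide how many components to retain, for instance enough to cover 95% of the variance:
print (np.cumsum(pca.explained_variance_ratio_))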
In [56]:
import numpy as np
import matplotlib.pyplot as plt
explained_variance = pca.explained_variance_ratio_
plt.title('Portion of explained variance by component')
range_ = [r+1 for r in range(len(explained_variance))]
plt.bar(range_,explained_variance, color="b", alpha=0.4, align="center")
plt.plot(range_,explained_variance,'ro-')
for pos, pct in enumerate(explained_variance):
    plt.annotate(str(round(pct,2)), (pos+1, pct+0.007))
plt.xticks(range_)
plt.show()
In [57]:
scatter = plt.scatter(C[:,0],C[:,1], facecolors='none', edgecolors='black')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
In [58]:
scatter = plt.scatter(C[:,0],C[:,2], facecolors='none', edgecolors='black')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 3')